# save the average T1/T3 baseline Shannon diversity as individual variabledat_baselineDiv <- dat %>%filter(timepoint %in%c("T1", "T3")) %>%group_by(participant_id) %>%summarize(baseline_div =mean(diversity_shannon, na.rm =TRUE))dat <- dat %>% dplyr::left_join(dat_baselineDiv, by ="participant_id")# prepare data for regression modelsmedian_baselineDiv <-median(dat_baselineDiv$baseline_div)dat <- dat %>%filter(intervention !="FollowUp") %>%mutate(relAbd_log =log(relAbd +1),intervention =case_when(treatment =="Baseline"~"Baseline",TRUE~ intervention),age_centered50 = age -50,bmi_centered25 = bmi_t0 -25,age_cat =case_when(age <50~paste0(min(age), "-49"),TRUE~paste0("50-", max(age))),age_cat =factor(age_cat),bmi_cat =case_when(bmi_t0 <25~paste0("[", floor(min(bmi_t0)), ", 25)"),TRUE~paste0("[25, ", ceiling(max(bmi_t0)), "]")),bmi_cat =factor(bmi_cat),phase =case_when(timepoint %in%c("T3","T4") ~"phase 2",TRUE~"phase 1"),phase =factor(phase),dailyFiber_cat =case_when(dailyConsumption_fiber >=30~">= 30g",TRUE~"< 30g"),dailyFiber_cat =factor(dailyFiber_cat),baselineDiv_cat =case_when(baseline_div < median_baselineDiv ~"[2.8, 3.85)",TRUE~"[3.85, 4.4)"),baselineDiv_cat =factor(baselineDiv_cat))
Run screening
Run screening on 362 not-low-abundance species.
For each species, a Generalized Mixed Regression Model is estimated, identical to the models estimated for the blood measurements for the blood paper, with response parameter the relative abundance.
Model selection is performed by estimating either estimating (A) a Gaussian model, and (B) a Gaussian model on ‘log(y + 1)’ transformed values, or (A) a zero-inflated Gaussian model (with only an intercept predictor for the ZI part, as a general excess zero correction), and (B) a zero-inflated Gaussian model on ‘log(y + 1)’ transformed values for each gene and selecting the best performing one of the respective two as the final model. If at least 10% of a gene’s observations are zero values then the zero-inflated models are estimated, otherwise the non-zero-inflated models are estimated.
Notes:
For the non-zero-inflated models the best model is chosen based on the share of explained deviance
For the zero-inflated models the best model is chosen based on the lower average of all absolute residuals, since these models are estimated based on a quasi-likelihood approach and the deviance is not available.
Unused alternatives
a zero-inflated Poisson model could have been estimated on rounded values. However, a Poisson distribution is not able to account for more strongly right-skewed distributions with a stronger probability mass at zero
a Tweedie (p = 1.5) distribution would be a right-skewed continuous distribution with also potential probability mass at zero. However, since (1) it’s not a proper established distribution and (2) the used log transformation already accounts for right-skewedness we don’t need it and don’t use it
Correction for multiple testing is performed by applying the Benjamini-Hochberg (FDR) method to all interventions’ p-values.
Additionally, the intervention effects for every model are estimated in separate interactions with gender (male or female), age (above or below 50), baseline BMI (above or below 25), daily fiber consumption (above or below 30), baseline Shannon diversity (above or below tbd). Independently for every interaction variable (age, sex, BMI, fiber, diversity) a multiple testing correction is applied.
Note: For these screening analyses we use a significance threshold of 0.1 for q-values and 0.05 for uncorrected p-values.
The models are estimated with mgcv::gam() and NBZIMM::lme.zig().
Code
# helper function to estimate the models# - dat_model: Dataset used for model estimation# - model_type: One of c("Gaussian", "ZI Gaussian")# - model_formula: A model formula created with "as.formula"estimate_model <-function(dat_model, model_type, model_formula) { checkmate::assert_data_frame(dat_model) checkmate::assert_choice(model_type, choices =c("Gaussian", "ZI Gaussian")) checkmate::assert_class(model_formula, classes ="formula")if (model_type =="Gaussian") { model <-gam(model_formula, method ="REML", family ="gaussian", data = dat_model) } else { # model_type = "ZI Gaussian" model <-tryCatch({ lme.zig(fixed = model_formula,zi_fixed =~1,random =~1| participant_id,zi_random =NULL,data = dat_model) },error =function(e) { return(NULL) }) }return(model)}
# code for running the screening. It takes about 3 minutes on 15 cores# to estimate the results. Instead of newly estimating them each time# the html is generated, the results were saved after running the screening# once, and are now just read from the results csvdat_results <-read.csv("8_species_regressionScreening_results.csv")# species <- dat %>% pull(FeatureID) %>% unique() %>% as.character()# # # number of cores to use for the parallelized call# n_cores <- 15# # # create a cluster object for Windows parallelization# cl <- makeCluster(n_cores)# # # export necessary functions / data objects to the cluster workers# clusterExport(cl, c("runScreening_oneSpecies", "estimate_model", "estimate_interactionModels",# "species", "dat", "gam", "lme.zig", "summary.gam"))# # # run the screening (parallelized call)# datList_results <- parLapply(cl, species, runScreening_oneSpecies)# # # stop the Windows cluster# stopCluster(cl)# # # merge results datasets# dat_results <- datList_results %>% dplyr::bind_rows()# # # perform multiple testing correction and round values in the data# p_corrected <- p.adjust(c(dat_results$freshInt_pvalue, dat_results$pastInt_pvalue), method = "BH")# pAgeInt_corrected <- p.adjust(c(dat_results$freshInt_ageLow_pvalue, dat_results$pastInt_ageLow_pvalue,# dat_results$freshInt_ageHigh_pvalue, dat_results$pastInt_ageHigh_pvalue), method = "BH")# pSexInt_corrected <- p.adjust(c(dat_results$freshInt_sexM_pvalue, dat_results$pastInt_sexM_pvalue,# dat_results$freshInt_sexW_pvalue, dat_results$pastInt_sexW_pvalue), method = "BH")# pBMIInt_corrected <- p.adjust(c(dat_results$freshInt_bmiLow_pvalue, dat_results$pastInt_bmiLow_pvalue,# dat_results$freshInt_bmiHigh_pvalue, dat_results$pastInt_bmiHigh_pvalue), method = "BH")# pFibInt_corrected <- p.adjust(c(dat_results$freshInt_fibLow_pvalue, dat_results$pastInt_fibLow_pvalue,# dat_results$freshInt_fibHigh_pvalue, dat_results$pastInt_fibHigh_pvalue), method = "BH")# pDivInt_corrected <- p.adjust(c(dat_results$freshInt_divLow_pvalue, dat_results$pastInt_divLow_pvalue,# dat_results$freshInt_divHigh_pvalue, dat_results$pastInt_divHigh_pvalue), method = "BH")# n <- nrow(dat_results)# dat_results <- dat_results %>%# mutate(freshInt_pvalue_correct = p_corrected[1:n],# pastInt_pvalue_correct = p_corrected[(n+1):(2*n)],# freshInt_ageLow_pvalue_correct = pAgeInt_corrected[1:n],# pastInt_ageLow_pvalue_correct = pAgeInt_corrected[(n+1):(2*n)],# freshInt_ageHigh_pvalue_correct = pAgeInt_corrected[(2*n+1):(3*n)],# pastInt_ageHigh_pvalue_correct = pAgeInt_corrected[(3*n+1):(4*n)],# freshInt_sexM_pvalue_correct = pSexInt_corrected[1:n],# pastInt_sexM_pvalue_correct = pSexInt_corrected[(n+1):(2*n)],# freshInt_sexW_pvalue_correct = pSexInt_corrected[(2*n+1):(3*n)],# pastInt_sexW_pvalue_correct = pSexInt_corrected[(3*n+1):(4*n)],# freshInt_bmiLow_pvalue_correct = pBMIInt_corrected[1:n],# pastInt_bmiLow_pvalue_correct = pBMIInt_corrected[(n+1):(2*n)],# freshInt_bmiHigh_pvalue_correct = pBMIInt_corrected[(2*n+1):(3*n)],# pastInt_bmiHigh_pvalue_correct = pBMIInt_corrected[(3*n+1):(4*n)],# freshInt_fibLow_pvalue_correct = pFibInt_corrected[1:n],# pastInt_fibLow_pvalue_correct = pFibInt_corrected[(n+1):(2*n)],# freshInt_fibHigh_pvalue_correct = pFibInt_corrected[(2*n+1):(3*n)],# pastInt_fibHigh_pvalue_correct = pFibInt_corrected[(3*n+1):(4*n)],# freshInt_divLow_pvalue_correct = pDivInt_corrected[1:n],# pastInt_divLow_pvalue_correct = pDivInt_corrected[(n+1):(2*n)],# freshInt_divHigh_pvalue_correct = pDivInt_corrected[(2*n+1):(3*n)],# pastInt_divHigh_pvalue_correct = pDivInt_corrected[(3*n+1):(4*n)]) %>%# mutate_at(c(2:13, 15:ncol(.)), function(x) { round(x, 4) })# # write.csv(dat_results, file = "8_species_regressionScreening_results.csv", row.names = FALSE)
Results
Overview of the numbers of significant genes
Code
data.frame("what"=c("No. of species with relevant abundance","No. of significant species (fresh intervention)","No. of significant species (past. intervention)"),"value"=c(nrow(dat_results),sum(dat_results$freshInt_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_pvalue_correct <0.1, na.rm =TRUE))) %>%kable() %>%kable_styling()
… alternatively, looking at the p-values before correction for multiple testing:
Code
data.frame("what"=c("No. of species with relevant abundance","No. of significant species (fresh intervention)","No. of significant species (past. intervention)"),"value"=c(nrow(dat_results),sum(dat_results$freshInt_pvalue <0.1, na.rm =TRUE),sum(dat_results$pastInt_pvalue <0.1, na.rm =TRUE))) %>%kable() %>%kable_styling()
Inspiration for adding the phylogenetic tree: https://stackoverflow.com/questions/52281227/adding-a-ggtree-object-to-already-existing-ggplot-with-shared-y-axis
data.frame("what"=c("No. of species with relevant abundance","No. of significant species (fresh intervention - age low)","No. of significant species (fresh intervention - age high)","No. of significant species (past. intervention - age low)","No. of significant species (past. intervention - age high)","No. of significant species (fresh intervention - gender male)","No. of significant species (fresh intervention - gender female)","No. of significant species (past. intervention - gender male)","No. of significant species (past. intervention - gender female)","No. of significant species (fresh intervention - BMI low)","No. of significant species (fresh intervention - BMI high)","No. of significant species (past. intervention - BMI low)","No. of significant species (past. intervention - BMI high)","No. of significant species (fresh intervention - fiber low)","No. of significant species (fresh intervention - fiber high)","No. of significant species (past. intervention - fiber low)","No. of significant species (past. intervention - fiber high)","No. of significant species (fresh intervention - baseline diversity low)","No. of significant species (fresh intervention - baseline diversity high)","No. of significant species (past. intervention - baseline diversity low)","No. of significant species (past. intervention - baseline diversity high)"),"value"=c(nrow(dat_results),sum(dat_results$freshInt_ageLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_ageHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_ageLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_ageHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_sexM_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_sexW_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_sexM_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_sexW_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_bmiLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_bmiHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_bmiLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_bmiHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_fibLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_fibHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_fibLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_fibHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_divLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$freshInt_divHigh_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_divLow_pvalue_correct <0.1, na.rm =TRUE),sum(dat_results$pastInt_divHigh_pvalue_correct <0.1, na.rm =TRUE))) %>%kable() %>%kable_styling()
what
value
No. of species with relevant abundance
362
No. of significant species (fresh intervention - age low)
1
No. of significant species (fresh intervention - age high)
3
No. of significant species (past. intervention - age low)
1
No. of significant species (past. intervention - age high)
2
No. of significant species (fresh intervention - gender male)
1
No. of significant species (fresh intervention - gender female)
1
No. of significant species (past. intervention - gender male)
0
No. of significant species (past. intervention - gender female)
0
No. of significant species (fresh intervention - BMI low)
1
No. of significant species (fresh intervention - BMI high)
2
No. of significant species (past. intervention - BMI low)
1
No. of significant species (past. intervention - BMI high)
1
No. of significant species (fresh intervention - fiber low)
1
No. of significant species (fresh intervention - fiber high)
1
No. of significant species (past. intervention - fiber low)
2
No. of significant species (past. intervention - fiber high)
0
No. of significant species (fresh intervention - baseline diversity low)
1
No. of significant species (fresh intervention - baseline diversity high)
1
No. of significant species (past. intervention - baseline diversity low)
1
No. of significant species (past. intervention - baseline diversity high)
0
… alternatively, the numbers of significant genes based on uncorrected p-values
Code
data.frame("what"=c("No. of species with relevant abundance","No. of significant species (fresh intervention - age low)","No. of significant species (fresh intervention - age high)","No. of significant species (past. intervention - age low)","No. of significant species (past. intervention - age high)","No. of significant species (fresh intervention - gender male)","No. of significant species (fresh intervention - gender female)","No. of significant species (past. intervention - gender male)","No. of significant species (past. intervention - gender female)","No. of significant species (fresh intervention - BMI low)","No. of significant species (fresh intervention - BMI high)","No. of significant species (past. intervention - BMI low)","No. of significant species (past. intervention - BMI high)","No. of significant species (fresh intervention - fiber low)","No. of significant species (fresh intervention - fiber high)","No. of significant species (past. intervention - fiber low)","No. of significant species (past. intervention - fiber high)","No. of significant species (fresh intervention - baseline diversity low)","No. of significant species (fresh intervention - baseline diversity high)","No. of significant species (past. intervention - baseline diversity low)","No. of significant species (past. intervention - baseline diversity high)"),"value"=c(nrow(dat_results),sum(dat_results$freshInt_ageLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_ageHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_ageLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_ageHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_sexM_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_sexW_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_sexM_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_sexW_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_bmiLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_bmiHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_bmiLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_bmiHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_fibLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_fibHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_fibLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_fibHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_divLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$freshInt_divHigh_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_divLow_pvalue <0.05, na.rm =TRUE),sum(dat_results$pastInt_divHigh_pvalue <0.05, na.rm =TRUE))) %>%kable() %>%kable_styling()
what
value
No. of species with relevant abundance
362
No. of significant species (fresh intervention - age low)
14
No. of significant species (fresh intervention - age high)
27
No. of significant species (past. intervention - age low)
26
No. of significant species (past. intervention - age high)
21
No. of significant species (fresh intervention - gender male)
19
No. of significant species (fresh intervention - gender female)
24
No. of significant species (past. intervention - gender male)
21
No. of significant species (past. intervention - gender female)
24
No. of significant species (fresh intervention - BMI low)
25
No. of significant species (fresh intervention - BMI high)
23
No. of significant species (past. intervention - BMI low)
32
No. of significant species (past. intervention - BMI high)
21
No. of significant species (fresh intervention - fiber low)
22
No. of significant species (fresh intervention - fiber high)
21
No. of significant species (past. intervention - fiber low)
29
No. of significant species (past. intervention - fiber high)
18
No. of significant species (fresh intervention - baseline diversity low)
23
No. of significant species (fresh intervention - baseline diversity high)
21
No. of significant species (past. intervention - baseline diversity low)
20
No. of significant species (past. intervention - baseline diversity high)
First read additional data to correlate species with SCFA-related genes
Code
# read the Excel file from Till Robin, mapping individual genes to specific SCFAspath_SCFAgenes <-"9_lookup_SCFA-KEGG-KO.xlsx"dat_scfaLookup <-read_excel(path_SCFAgenes, sheet =1) %>% dplyr::select(SCFA, KO)
# create one big joint table with one row per sampledat_species_wide <- dat_species %>%mutate(species =paste0("species_", species)) %>% tidyr::pivot_wider(id_cols = SampleID, names_from = species, values_from = relAbd)dat_genes_wide <- dat_genes %>%mutate(gene =paste0("gene_", gene)) %>% tidyr::pivot_wider(id_cols = SampleID, names_from = gene, values_from = tpmAbd)dat_speciesGenes <- dat_species_wide %>% dplyr::left_join(dat_genes_wide, by ="SampleID")
Compare 89 SCFA-related genes with relevant abundance with 362 species with relevant abundance.
Code
# create a correlation matrix with species in rows and genes in columnscor_mat <-cor(x = dat_speciesGenes %>%select(starts_with("species_")),y = dat_speciesGenes %>%select(starts_with("gene_")))# pretty up the column and row namescolnames(cor_mat) <-colnames(cor_mat) %>%gsub(pattern ="gene_", replacement ="")row.names(cor_mat) <-row.names(cor_mat) %>%gsub(pattern ="species_", replacement ="")# reformat the correlation matrix to a ggplot'able data.frameplot_dat <-data.frame(species =rep(row.names(cor_mat), times =ncol(cor_mat)),gene =rep(colnames(cor_mat), each =nrow(cor_mat)),corr =as.vector(cor_mat))# add species and gene categories to the dataplot_dat <- plot_dat %>% dplyr::left_join(dat_scfaLookup, by =c("gene"="KO")) %>% dplyr::left_join(dat_species %>%select(phylum, species) %>%distinct(species, .keep_all =TRUE), by ="species") %>%mutate(phylum =droplevels(phylum))# sort the genes according to the average highest correlationsgenes_sort <- plot_dat %>%group_by(gene) %>%summarize(avg_corr =mean(corr)) %>%arrange(desc(avg_corr)) %>%pull(gene)plot_dat <- plot_dat %>%mutate(gene =factor(gene, levels = genes_sort))# sort species according to the correlation with the gene with the on average highest correlationsspecies_sort <- plot_dat %>%filter(gene =="K03737") %>%group_by(species) %>%summarize(avg_corr =mean(corr)) %>%arrange(desc(avg_corr)) %>%pull(species)plot_dat <- plot_dat %>%mutate(species =factor(species, levels =rev(species_sort)))